library(tidyverse)
library(dplyr)
library(ggplot2)
library(rgdal)
library(tmap)
library(readxl)
library(ggrepel)
library(ggthemes)
library(scales)
library(sf)
library(ggmap)
library(DT)
library(ggmap)
library(htmltools)
library(leaflet)
library(quanteda)
library(stringr)
library(tidytext)
library(tidyverse)
library(tm)
library(wordcloud)
library(wesanderson)
library(qdap)
library(SnowballC)
ggmap::register_google(key = "AIzaSyB1-MjiXEIrgdT3FbflMLc8EUaQXVG3XVY")

Kickstarter Projects

str(df)
'data.frame':   125926 obs. of  24 variables:
 $ backers_count           : int  4 35 310 1 36 22 10 187 5 1 ...
 $ blurb                   : chr  "Soaps made with love, care, creativity and you in mind, directly from South Texas." "Pens made from Whiskey barrels, Jack Daniel's Maker's Mark, Wild Turkey and Jim Beam" "Finally we have a building but we need your help to kick start it and get going. We are calling EVERYONE who believes in us." "Everything home made in one store. From jewelry to small furniture to clothing to soaps bath bombs and candles "| __truncated__ ...
 $ converted_pledged_amount: int  41 2205 8861 100 1026 1495 2591 4515 48 1 ...
 $ country                 : chr  "USA" "USA" "USA" "USA" ...
 $ country_displayable_name: chr  "the United States" "the United States" "the United States" "the United States" ...
 $ created_at              : chr  "2017-12-04" "2018-06-07" "2015-09-23" "2014-10-04" ...
 $ currency                : chr  "USD" "USD" "USD" "USD" ...
 $ deadline                : chr  "2018-01-04" "2018-07-11" "2015-11-15" "2014-11-07" ...
 $ goal                    : num  150 900 8500 50000 800 12000 24000 4000 200 11900 ...
 $ id                      : int  123246984 1207132794 1899686686 1081353908 84306631 1791742083 1569598343 111033308 1023343641 234361295 ...
 $ is_starrable            : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ launched_at             : chr  "2017-12-05" "2018-06-11" "2015-10-01" "2014-10-08" ...
 $ name                    : chr  "Soaps in Texas" "Whiskey Pens" "The Posh Factory" "Home Made" ...
 $ pledged                 : num  41 2205 8861 100 1026 ...
 $ slug                    : chr  "soaps-in-texas" "whiskey-pens" "the-posh-factory" "home-made" ...
 $ source_url              : chr  "https://www.kickstarter.com/discover/categories/crafts/diy" "https://www.kickstarter.com/discover/categories/crafts" "https://www.kickstarter.com/discover/categories/dance" "https://www.kickstarter.com/discover/categories/crafts/diy" ...
 $ spotlight               : logi  FALSE TRUE TRUE FALSE TRUE FALSE ...
 $ staff_pick              : logi  FALSE FALSE FALSE FALSE TRUE FALSE ...
 $ state                   : chr  "failed" "successful" "successful" "failed" ...
 $ state_changed_at        : chr  "2018-01-04" "2018-07-11" "2015-11-15" "2014-11-07" ...
 $ location_town           : chr  "Edinburg" "Columbia" "Jacksonville" "Detroit" ...
 $ location_state          : chr  "TX" "SC" "FL" "MI" ...
 $ top_category            : chr  "crafts" "crafts" "dance" "crafts" ...
 $ sub_category            : chr  "diy" "woodworking" "spaces" "diy" ...
df$state_changed_at <- as.Date(df$state_changed_at)
my_number <- length(unique(df$top_category))
my_colors <- wes_palette("FantasticFox1", my_number, type = "continuous")
my_colors

1. Identifying Successful Projects

a) Success by Category

There are several ways to identify success of a project:
- State (state): Whether a campaign was successful or not.
- Pledged Amount (pledged)
- Achievement Ratio: The variable achievement_ratio is calculating the percentage of the original monetary goal reached by the actual amount pledged (that is pledged\goal *100).
- Number of backers (backers_count)
- How quickly the goal was reached (difference between launched_at and state_changed_at) for those campaigns that were successful.

Use one or more of these measures to visually summarize which categories were most successful in attracting funding on kickstarter. Briefly summarize your findings.

df$achievement_ratio <- (df$pledged / df$goal) * 100
df$realizing_time <- ifelse(df$state == "successful", as.Date(df$state_changed_at) - as.Date(df$launched_at), NA)

by State (successful or not)

df %>%
  dplyr::select(`top_category`, `state`) %>%
  group_by(`top_category`, `state`) %>%
  summarize(counts = n()) %>%

  ggplot(aes(x = `top_category`, y = counts)) +
  geom_bar(
    aes(color = `state`, fill = `state`),
    stat = "identity", position = position_dodge(0.8),
    width = 0.7) +
  
  labs(
  title = "Kickstarter Projects state by category",
  x = "",
  y = "Number of Projects",
  x.axis = "",
  color = "State of the Project",
  fill = "State of the Project"
  ) +
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    ) + 
  
  scale_y_continuous(labels = comma) +
  scale_color_manual(values = c("#E17C03", "#B40F20", "#5EA6A9", "#8EBD6B"))+
  scale_fill_manual(values = c("#E17C03", "#B40F20", "#5EA6A9", "#8EBD6B")) +
  scale_x_discrete(guide = guide_axis(n.dodge=3))

At a first glance Music is the most successful while Crafts, Food, Journalism, and Photography seem riskier categories on Kickstarter.

by pledges

df %>%
  dplyr::select(`top_category`, `pledged`, `state_changed_at`, name) %>%
  
  ggplot(aes(x = state_changed_at, y = pledged, color = top_category, size = pledged)) +
  geom_point(na.rm=TRUE) +
  geom_label_repel(data=subset(df, pledged > 7500000), aes(label=(name)), show.legend = FALSE, min.segment.length = 0, seed = 42, box.padding = 0.5, size = 2.5) +
  
  scale_color_manual(values = my_colors) +
  
  labs(
  title = "Kickstarter Projects pledges",
  x = "Years",
  y = "Money pledged",
  x.axis = "",
  color = "Category"
  ) +
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
    
    axis.title.x = element_text(hjust = 0.475),
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    ) + 
  
  scale_size(guide="none") +
  
  scale_y_continuous(labels = comma)

Now we know that the four best project ever were in film, games, and technology. Moreover, it seems that, on average, technology receives a lot of pledges.

by Median Achievement Ratio

df %>%
  dplyr::select(`top_category`, `achievement_ratio`) %>%
  group_by(`top_category`) %>%
  summarize(`Median Achievement Ratio` = median(achievement_ratio, na.rm = TRUE)) %>%
  
  ggplot(aes(x = top_category, y = `Median Achievement Ratio`, fill = top_category)) +
  geom_col() +
 
  scale_fill_manual(values = my_colors, aesthetics = "fill") +
  geom_hline(yintercept=100) +
  
  labs(
  title = "Median Achievement per category",
  x = "",
  y = "Percentage (%)",
  x.axis = "",
  fill = "Category"
  ) +
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
   
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    ) + 
  
  scale_y_continuous(labels = comma) +
  scale_x_discrete(guide = guide_axis(n.dodge=3))

Median pledges per category

df %>%
  dplyr::select(`top_category`, `pledged`) %>%
  group_by(`top_category`) %>%
  summarize(`Median Pledges` = median(pledged, na.rm = TRUE)) %>%
  
  ggplot(aes(x = top_category, y = `Median Pledges`, fill = top_category)) +
  geom_col() +
 
  scale_fill_manual(values = my_colors, aesthetics = "fill") +
  
  labs(
  title = "Median Pledges per category",
  x = "",
  y = "Dollar ($)",
  x.axis = "",
  fill = "Category"
  ) +
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
   
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    ) + 
  
  scale_y_continuous(labels = comma) +
  scale_x_discrete(guide = guide_axis(n.dodge=3))

Success rate per category

success_rate_per_cat <- df %>%
  dplyr::select(`top_category`, `state`) %>%
  filter(`state` == "successful" |`state` == "failed") %>%
  group_by(`top_category`) %>%
  summarize(`Successful Projects` = sum(`state` == "successful", na.rm = TRUE), `Total Projects` = sum(`state` == "successful" | `state` == "failed" , na.rm = TRUE)) %>%
  mutate(`Success Rate` = (`Successful Projects`/ `Total Projects`) *100)

success_rate_per_cat
mean(success_rate_per_cat$`Success Rate`)
[1] 58.30427

BONUS ONLY: b) Success by Location

Now, use the location information to calculate the total number of successful projects by state (if you are ambitious, normalize by population). Also, identify the Top 50 “innovative” cities in the U.S. (by whatever measure you find plausible). Provide a leaflet map showing the most innovative states and cities in the U.S. on a single map based on these information.

df$location <- paste(df$location_town, df$location_state, sep = ", ")
states_pj <- df %>%
  group_by(location_state) %>%
  count(location_state) %>%
  arrange(desc(n))

cities_pg <- df %>%
  group_by(location) %>%
  count(location) %>%
  arrange(desc(n)) %>%
  head(50)

datatable(states_pj, filter = 'top', colnames = c("State", "Projects")) 
datatable(cities_pg,filter = 'top', colnames = c("Top innovative cities", "Projects")) 

If we’d consider Brooklyn as part of NYC, NYC would be the most innovative city. As a measure of a city innovation I simply used the amount of projects that come from there. Now, the whole list of cities is 8,762, which is too much to locate. I need to subset this list. I will keep just the first 100 most innovative cities.

top100_cities <- df %>%
  group_by(location) %>%
  count(location) %>%
  arrange(desc(n)) %>%
  head(100)
GeoCoded <- purrr::map_df(.x = top100_cities$location, .f = ggmap::geocode)
geocoded_df <- dplyr::bind_cols(top100_cities, GeoCoded) %>% 
  dplyr::select(
    lng = lon,
    lat,
    dplyr::everything())

I intalled the “tigris” package. However, I’m not very familiar with it and I found it more simple to use the shape file with the states from https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html .

us <- readOGR(dsn = '/Users/davidev7/Documents/Columbia/Second Semester/Data Visualization/Assignments/HW3/cb_2018_us_state_500k', layer = "cb_2018_us_state_500k")
OGR data source with driver: ESRI Shapefile 
Source: "/Users/davidev7/Documents/Columbia/Second Semester/Data Visualization/Assignments/HW3/cb_2018_us_state_500k", layer: "cb_2018_us_state_500k"
with 56 features
It has 9 fields
Integer64 fields read as strings:  ALAND AWATER 
states_pj$popuptext <- base::paste0("<b>", 
                                 "Number or Projects: ",
                                 "</b><br />",
                                 states_pj$n)
projects_by_state <- merge(us, states_pj, by.x = "STUSPS", by.y = "location_state")
geocoded_df$popuptext <- base::paste0("<b>", 
                                 "Number or Projects: ",
                                 "</b><br />",
                                 geocoded_df$n)
bins <- c(200, 500, 1000, 3000, 5000, 8000, 10000, Inf)
pal <- colorBin("RdYlBu", domain = states_pj$n, bins = bins)
map_innovation <- leaflet(geocoded_df) %>%
  setView(lng = -98.5795, lat = 39.8283, zoom = 2.5) %>%
  addProviderTiles(providers$CartoDB.PositronNoLabels) %>%
  addPolygons(data = projects_by_state, 
            color = "white", 
            weight = 2, 
            smoothFactor = 0.5,
            opacity = 1.0, 
            fillOpacity = 0.9,
            dashArray = "2",
            fillColor = ~pal(n),
            highlightOptions = highlightOptions(color = "red", weight = 2,
                                                bringToFront = TRUE),
            popup = ~popuptext ,
            group = "States",
            label = ~htmlEscape(as.character(NAME))) %>%
  addCircleMarkers(color = "orange",
             label = ~htmlEscape(as.character(location)),
             fill = TRUE,
             lng = ~lng, 
             lat = ~lat,
             popup = ~popuptext,
             group = "Cities") %>%
  addLayersControl(
     baseGroups = c("States", "Cities"),
     options = layersControlOptions(collapsed = FALSE)
  )

map_innovation

2. Writing your success story

Each project contains a blurb – a short description of the project. While not the full description of the project, the short headline is arguably important for inducing interest in the project (and ultimately popularity and success). Let’s analyze the text.

a) Cleaning the Text and Word Cloud

To reduce the time for analysis, select the 1000 most successful projects and a sample of 1000 unsuccessful projects. Use the cleaning functions introduced in lecture (or write your own in addition) to remove unnecessary words (stop words), syntax, punctuation, numbers, white space etc. Note, that many projects use their own unique brand names in upper cases, so try to remove these fully capitalized words as well (since we are aiming to identify common words across descriptions). Create a document-term-matrix.

Provide a word cloud of the most frequent or important words (your choice which frequency measure you choose) among the most successful projects.

removeNumPunct <- function(x){gsub("[^[:alpha:][:space:][-]]*", "", x)}

# my own functions
removeBrand <- function (x) {gsub("\\b[A-Z]+\\b", "", x)}
subDash <- function (x) {gsub("[-]", " ", x)}

# function seen in class
clean_corpus <- function(corpus){
  corpus <- tm_map(corpus, content_transformer(removeBrand))
  corpus <- tm_map(corpus, content_transformer(removeNumPunct))
  corpus <- tm_map(corpus, content_transformer(subDash))
  corpus <- tm_map(corpus, content_transformer(removePunctuation))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, c(stopwords("en")))
    # Optionally, one could add more stop words
  corpus <- tm_map(corpus, stripWhitespace)
  return(corpus)
}

# following the advice of comparing original text and cleaned
# df_cleaned <- clean_corpus(df)

# finally the stemming process
stemCompletion2 <- function(x, dictionary) {
   x <- unlist(strsplit(as.character(x), " "))
   x <- x[x != ""]
   x <- stemCompletion(x, dictionary=dictionary)
   x <- paste(x, sep="", collapse=" ")
   PlainTextDocument(stripWhitespace(x))
}
most_successful_pj <- df %>%
  filter(state == "successful") %>%
  arrange(desc(`pledged`)) %>%
  head(1000)

unsuccessful_pj <- df %>%
  filter(state == "failed") %>%
  sample_n(1000)
df_source_successful <- most_successful_pj %>%
  dplyr::select(`id`, `blurb`) %>%
  rename(doc_id = id, text = blurb)
df_source_un <- unsuccessful_pj %>%
  dplyr::select(`id`, `blurb`) %>%
  rename(doc_id = id, text = blurb)
VCorpus(DataframeSource(df_source_successful))
<<VCorpus>>
Metadata:  corpus specific: 0, document level (indexed): 0
Content:  documents: 1000
s_corpus_raw <- Corpus(VectorSource(df_source_successful$text))
s_corpus_raw$meta$id <- most_successful_pj$id

s_corpus_raw[[1]]$content
[1] "Critical Role's The Legend of Vox Machina reunites your favorite heroes for a professional-quality animated special!"
s_corpus_raw[[2]]$content
[1] "A revolutionary table that evolves over a lifetime. Innovative, yet affordable, with magnetic accessories. Crafted without compromise."
s_corpus_raw[[3]]$content
[1] "Cracking open the last closed platform: the TV.  A beautiful, affordable console -- built on Android, by the creator of Jambox."
u_corpus_raw <- Corpus(VectorSource(df_source_un$text))
u_corpus_raw$meta$id <- unsuccessful_pj$id
s_corpus_cleaned <- clean_corpus(s_corpus_raw)

s_corpus_cleaned[[1]]$content
[1] "critical roles legend vox machina reunites favorite heroes professional quality animated special"
s_corpus_cleaned[[2]]$content
[1] " revolutionary table evolves lifetime innovative yet affordable magnetic accessories crafted without compromise"
s_corpus_cleaned[[3]]$content
[1] "cracking open last closed platform beautiful affordable console built android creator jambox"
s_corpus_stemmed <- tm_map(s_corpus_cleaned, stemDocument)
s_corpus_final <- tm_map(s_corpus_stemmed, stemCompletion2, dictionary = s_corpus_cleaned)
u_corpus_cleaned <- clean_corpus(u_corpus_raw)
u_corpus_stemmed <- tm_map(u_corpus_cleaned, stemDocument)
u_corpus_final <- tm_map(u_corpus_stemmed, stemCompletion2, dictionary = u_corpus_cleaned)
successful_dtm <- DocumentTermMatrix(s_corpus_final)
successful_dtm$dimnames$Docs <- as.character(s_corpus_final$meta$id)
un_dtm <- DocumentTermMatrix(u_corpus_final)
un_dtm$dimnames$Docs <- as.character(u_corpus_final$meta$id)
successful_dtm_tidy <- tidy(successful_dtm)

successful_terms <- successful_dtm_tidy %>%
  group_by(term) %>%
  summarise(n = sum(count)) %>%
  arrange(desc(n)) %>%
  top_n(100)
un_dtm_tidy <- tidy(un_dtm)

unsuccessful_terms <- un_dtm_tidy %>%
  group_by(term) %>%
  summarise(n = sum(count)) %>%
  arrange(desc(n)) %>%
  top_n(100)
my_colors2 <- my_colors[c(8,1,14)]
wordcloud(words = successful_terms$term, 
          freq = as.integer(successful_terms$n),
          scale = c(4, .5),
          min.freq = 0, 
          max.words = 100,
          colors = my_colors2
          )

Since I’ve already done most of the work, I’m interested in a comparison with failed projects.

wordcloud(words = unsuccessful_terms$term, 
          freq = as.integer(unsuccessful_terms$n),
          scale = c(4, .5),
          min.freq = 0, 
          max.words = 100,
          colors = my_colors
          )

Apparently a cry for help doesn’t help! New appears to be equally frequent in both. While first is a signature for successful projects!

b) Success in words

Provide a pyramid plot to show how the words between successful and unsuccessful projects differ in frequency. A selection of 10 - 20 top words is sufficient here.

all_successful_terms <- successful_dtm_tidy %>%
  group_by(term) %>%
  summarise(n = sum(count)) %>%
  arrange(desc(n))

all_unsuccessful_terms <- un_dtm_tidy %>%
  group_by(term) %>%
  summarise(n = sum(count)) %>%
  arrange(desc(n))
terms <- merge(all_successful_terms, all_unsuccessful_terms, by = "term")
terms_for_plot <- terms %>% 
  filter(n.x > 30 | n.y >30)
terms_for_plot <- terms_for_plot[c(1, 7, 16, 19, 24, 27, 20, 29, 33, 35, 36, 50, 52, 53, 41, 42, 48, 12, 9, 3),]
scol<- my_colors[6]
ucol<- my_colors[14]
pyramid.plot(terms_for_plot$n.x, terms_for_plot$n.y, labels=terms_for_plot$term,
  main="Words frequency pyramid plot",
  top.labels = c("Successful", "Term", "Unsuccessful"),
  lxcol=scol,rxcol=ucol,
  laxlab = NULL,
             raxlab = NULL,
             unit = NULL,
             labelcex=0.5,
  gap=23,show.values=TRUE)
101 101 
[1] 5.1 4.1 2.1 2.1

It’s a very interesting graph from a psychological perspective. The more the words are “needy” the more is likely the project is going to fail. The same can be said for those projects in which things are not done yet. Words like “will” “make” are associated with failure.

c) Simplicity as a virtue

These blurbs are short in length (max. 150 characters) but let’s see whether brevity and simplicity still matters. Calculate a readability measure (Flesh Reading Ease, Flesh Kincaid or any other comparable measure) for the texts. Visualize the relationship between the readability measure and one of the measures of success. Briefly comment on your finding.

s_pj <- most_successful_pj %>%
  dplyr::select(`blurb`, `state`, `goal`, `achievement_ratio`) %>%
  filter(`goal` > 1000)

u_pj <- unsuccessful_pj %>%
  dplyr::select(`blurb`, `state`, `goal`, `achievement_ratio`) %>%
  filter(`goal` > 1000)

df_merged <- rbind(s_pj, u_pj)
readability_score <- textstat_readability(as.character(df_merged$blurb), 
        measure = c('Flesch','Flesch.Kincaid',
                  'meanSentenceLength','meanWordSyllables'))

readability_score[colnames(df_merged)] <- df_merged
readability_score %>%
  filter(log(achievement_ratio) > -10) %>%
  
  ggplot(aes(x = log(achievement_ratio), y = Flesch.Kincaid, color = state)) +
  geom_point() +
  geom_smooth(color = my_colors[8]) +
  
  scale_color_manual(values = c(ucol, scol)) +
  
  labs(
    title = "Relationship between Readability and Success", 
    x = "log Achievement Ratio", 
    y = "Flesch-Kincaid Grade Level",
    color = "State"
    ) +
  
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
    
    axis.title.x = element_text(hjust = 0.475),
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    )

There is a slightly, but negligible, positive relationship between having a description with an higher Flesch-Kincaid score and having an higher Achievement Ratio (Money pledged / Goal, when Goal > 1000). I filtered for Goal > 1000 because many projects set very low (unrealistic) goals. The Flesch-Kincaid scale measures how easily readable a text is. The higher the score the easier the text is easy to read.

3. Sentiment

Now, let’s check whether the use of positive / negative words or specific emotions helps a project to be successful.

a) Stay positive

Calculate the tone of each text based on the positive and negative words that are being used. You can rely on the Hu & Liu dictionary provided in lecture or use the Bing dictionary contained in the tidytext package (tidytext::sentiments). Visualize the relationship between tone of the document and success. Briefly comment.

After consulting https://www.tidytextmining.com/sentiment.html, I selected the “nrc” dictionary from Saif Mohammad and Peter Turney as it’s the one with more words.

dataframe1 <- data.frame(text=sapply(s_corpus_cleaned, identity), 
    stringsAsFactors=F)
dataframe1 <- dataframe1 %>% 
  mutate(index = row_number())

df1.1 <- most_successful_pj %>%
  dplyr::select(`state`, `goal`, `achievement_ratio`) %>%
  mutate(index = row_number())

dataframe1.2 <- left_join(dataframe1, df1.1, by = "index")
dataframe2 <- data.frame(text=sapply(u_corpus_cleaned, identity), 
    stringsAsFactors=F)
dataframe2 <- dataframe2 %>% 
  mutate(index = row_number())

df2.1 <- unsuccessful_pj %>%
  dplyr::select(`state`, `goal`, `achievement_ratio`) %>%
  mutate(index = row_number())

dataframe2.2 <- left_join(dataframe2, df2.1, by = "index")
df_merged_cleaned <- rbind(dataframe1.2, dataframe2.2)
library(tidytext)
library(syuzhet)

polarity_score <- get_sentiment(df_merged_cleaned$text, method="nrc")
log(100)
[1] 4.60517

When Achievement Rate = 100 the project reached its goal.

df_merged_cleaned %>% 
  filter(goal > 1000) %>%
  filter(log(achievement_ratio) > -8) %>%
 
  ggplot(aes(x = sentiment, y = log(achievement_ratio), color = state)) + 
  geom_point() +
  geom_smooth(color = my_colors[8]) +
  
  geom_hline(yintercept = 4.60517) +
  geom_label(
    label="Goal reached", 
    x=6.35,
    y=6,
    label.padding = unit(0.45, "lines"),
    label.size = 0.15,
    color = "black",
    fill="honeydew2"
  ) +
  
  scale_color_manual(values = c(ucol, scol)) +
  
  labs(
    title = "Relationship between Text Sentiment and Success", 
    x = "Text Sentiment", 
    y = "log Achievement Ratio",
    color = "State"
    ) +
  
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
    
    axis.title.x = element_text(hjust = 0.475),
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    )

This is a weird result. Apparently, once the text of a project goes beyond a slightly positive sentiment, its success chances decrease. However, it might make sense! Extremely positive texts might be suspicious. Projects might appear too good to be true.

b) Positive vs negative

Segregate all 2,000 blurbs into positive and negative texts based on their polarity score calculated in step (a). Now, collapse the positive and negative texts into two larger documents. Create a document-term-matrix based on this collapsed set of two documents. Generate a comparison cloud showing the most-frequent positive and negative words.

df_merged_cleaned$polarity <- ifelse(df_merged_cleaned$sentiment < 0, "Negative", ifelse(df_merged_cleaned$sentiment >= 0, "Positive", NA))
df_positive <- df_merged_cleaned %>%
  filter(polarity == "Positive")

df_negative <- df_merged_cleaned %>%
  filter(polarity == "Negative")
df_3b1 <- unnest_tokens(df_merged_cleaned, output = word, input = text) 

df_3b <- df_3b1 %>%
  group_by(polarity, word) %>%
  summarise(count = n()) %>%
  cast_tdm(word, polarity, count)

matrix3b <- as.matrix(df_3b)
set.seed(2105)
comparison.cloud(matrix3b, colors = c(ucol, scol), 
                 title.size= 1.3,
                 max.words = 100,
                 random.order = FALSE)

c) Get in their mind

Now, use the NRC Word-Emotion Association Lexicon in the tidytext package to identify a larger set of emotions (anger, anticipation, disgust, fear, joy, sadness, surprise, trust). Again, visualize the relationship between the use of words from these categories and success. What is your finding?

sentiments <- get_sentiments("nrc")
df_sentiments <-  inner_join(df_3b1, sentiments, by = "word")
df_sentiments %>%
  
  ggplot(aes(x = sentiment.y, y = achievement_ratio, color = state)) + 
  geom_point() +
  
  scale_color_manual(values = c(ucol, scol)) +
  
  labs(
    title = "Relationship between Sentiments and Success", 
    x = "Sentiments", 
    y = "Achievement Ratio",
    color = "State"
    ) +
  
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    )

The most successful projects have words that induce positive emotions like Anticipation, Joy, Positive, Surprise, and Trust. There is also a successful project that has Anger and Negative. However, none of the most successful has Disgust or Fear or Sadness. Moreover, Positive and Trust seem to be the two with the highest correlation with the Achievement Ratio.

---
title: 'Assignment 3: Kickstarter Projects'
author: "Davide Vaccari"
date: "3/31/2021"
output: 
  html_document:
    toc: TRUE
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, error = FALSE, warning = FALSE, message = FALSE)
knitr::opts_knit$set(root.dir = normalizePath("/Users/davidev7/Documents/Columbia/Second Semester/Data Visualization/Assignments/HW3"))
```

```{r}
library(tidyverse)
library(dplyr)
library(ggplot2)
library(rgdal)
library(tmap)
library(readxl)
library(ggrepel)
library(ggthemes)
library(scales)
library(sf)
library(ggmap)
library(DT)
library(ggmap)
library(htmltools)
library(leaflet)
library(quanteda)
library(stringr)
library(tidytext)
library(tidyverse)
library(tm)
library(wordcloud)
library(wesanderson)
library(qdap)
library(SnowballC)
library(plotrix)
library(magrittr)
```

```{r}
ggmap::register_google(key = "AIzaSyB1-MjiXEIrgdT3FbflMLc8EUaQXVG3XVY")
```

Kickstarter Projects
================================

```{r}
df <- read.csv("/Users/davidev7/Documents/Columbia/Second Semester/Data Visualization/course_content-main/Exercises/09_kickstarter/kickstarter_projects_2021-03.csv", stringsAsFactors = FALSE)

str(df)
```

```{r}
df$state_changed_at <- as.Date(df$state_changed_at)
```

```{r}
my_number <- length(unique(df$top_category))
my_colors <- wes_palette("FantasticFox1", my_number, type = "continuous")
my_colors
```

## 1. Identifying Successful Projects

### a) Success by Category

There are several ways to identify success of a project:  
  - State (`state`): Whether a campaign was successful or not.   
  - Pledged Amount (`pledged`)   
  - Achievement Ratio: The variable `achievement_ratio` is calculating the percentage of the original monetary `goal` reached by the actual amount `pledged` (that is `pledged`\\`goal` *100).    
  - Number of backers (`backers_count`)  
  - How quickly the goal was reached (difference between `launched_at` and `state_changed_at`) for those campaigns that were successful.  

Use one or more of these measures to visually summarize which categories were most successful in attracting funding on kickstarter. Briefly summarize your findings.

```{r}
df$achievement_ratio <- (df$pledged / df$goal) * 100
df$realizing_time <- ifelse(df$state == "successful", as.Date(df$state_changed_at) - as.Date(df$launched_at), NA)
```

```{r}
head(df)
```

#### by State (successful or not)

```{r}
df %>%
  dplyr::select(`top_category`, `state`) %>%
  group_by(`top_category`, `state`) %>%
  summarize(counts = n()) %>%

  ggplot(aes(x = `top_category`, y = counts)) +
  geom_bar(
    aes(color = `state`, fill = `state`),
    stat = "identity", position = position_dodge(0.8),
    width = 0.7) +
  
  labs(
  title = "Kickstarter Projects state by category",
  x = "",
  y = "Number of Projects",
  x.axis = "",
  color = "State of the Project",
  fill = "State of the Project"
  ) +
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    ) + 
  
  scale_y_continuous(labels = comma) +
  scale_color_manual(values = c("#E17C03", "#B40F20", "#5EA6A9", "#8EBD6B"))+
  scale_fill_manual(values = c("#E17C03", "#B40F20", "#5EA6A9", "#8EBD6B")) +
  scale_x_discrete(guide = guide_axis(n.dodge=3))
```

At a first glance Music is the most successful while Crafts, Food, Journalism, and Photography seem riskier categories on Kickstarter.

#### by pledges

```{r}
df %>%
  dplyr::select(`top_category`, `pledged`, `state_changed_at`, name) %>%
  
  ggplot(aes(x = state_changed_at, y = pledged, color = top_category, size = pledged)) +
  geom_point(na.rm=TRUE) +
  geom_label_repel(data=subset(df, pledged > 7500000), aes(label=(name)), show.legend = FALSE, min.segment.length = 0, seed = 42, box.padding = 0.5, size = 2.5) +
  
  scale_color_manual(values = my_colors) +
  
  labs(
  title = "Kickstarter Projects pledges",
  x = "Years",
  y = "Money pledged",
  x.axis = "",
  color = "Category"
  ) +
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
    
    axis.title.x = element_text(hjust = 0.475),
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    ) + 
  
  scale_size(guide="none") +
  
  scale_y_continuous(labels = comma)
```

Now we know that the four best project ever were in film, games, and technology. Moreover, it seems that, on average, technology receives a lot of pledges.

#### by Median Achievement Ratio

```{r}
df %>%
  dplyr::select(`top_category`, `achievement_ratio`) %>%
  group_by(`top_category`) %>%
  summarize(`Median Achievement Ratio` = median(achievement_ratio, na.rm = TRUE)) %>%
  
  ggplot(aes(x = top_category, y = `Median Achievement Ratio`, fill = top_category)) +
  geom_col() +
 
  scale_fill_manual(values = my_colors, aesthetics = "fill") +
  geom_hline(yintercept=100) +
  
  labs(
  title = "Median Achievement per category",
  x = "",
  y = "Percentage (%)",
  x.axis = "",
  fill = "Category"
  ) +
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
   
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    ) + 
  
  scale_y_continuous(labels = comma) +
  scale_x_discrete(guide = guide_axis(n.dodge=3))
```

#### Median pledges per category

```{r}
df %>%
  dplyr::select(`top_category`, `pledged`) %>%
  group_by(`top_category`) %>%
  summarize(`Median Pledges` = median(pledged, na.rm = TRUE)) %>%
  
  ggplot(aes(x = top_category, y = `Median Pledges`, fill = top_category)) +
  geom_col() +
 
  scale_fill_manual(values = my_colors, aesthetics = "fill") +
  
  labs(
  title = "Median Pledges per category",
  x = "",
  y = "Dollar ($)",
  x.axis = "",
  fill = "Category"
  ) +
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
   
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    ) + 
  
  scale_y_continuous(labels = comma) +
  scale_x_discrete(guide = guide_axis(n.dodge=3))
```

#### Success rate per category

```{r}
success_rate_per_cat <- df %>%
  dplyr::select(`top_category`, `state`) %>%
  filter(`state` == "successful" |`state` == "failed") %>%
  group_by(`top_category`) %>%
  summarize(`Successful Projects` = sum(`state` == "successful", na.rm = TRUE), `Total Projects` = sum(`state` == "successful" | `state` == "failed" , na.rm = TRUE)) %>%
  mutate(`Success Rate` = (`Successful Projects`/ `Total Projects`) *100)

success_rate_per_cat
```

```{r}
mean(success_rate_per_cat$`Success Rate`)
```


```{r}
success_rate_per_cat %>%
  
 ggplot(aes(x = top_category, y = `Success Rate`, fill = top_category)) +
  geom_col() +
 
  scale_fill_manual(values = my_colors, aesthetics = "fill") +
  geom_hline(yintercept = 58.30427) +
  geom_label(
    label="Average Success Rate", 
    x=7.5,
    y=52,
    label.padding = unit(0.55, "lines"),
    label.size = 0.15,
    color = "black",
    fill="honeydew2"
  ) +
  
  labs(
  title = "Success Rate per category",
  x = "",
  y = "Percentage (%)",
  x.axis = "",
  fill = "Category"
  ) +
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
   
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    ) + 
  
  scale_y_continuous(labels = comma) +
  scale_x_discrete(guide = guide_axis(n.dodge=3))
```

### **BONUS ONLY:** b) Success by Location

Now, use the location information to calculate the total number of successful projects by state (if you are ambitious, normalize by population). Also, identify the Top 50 "innovative" cities in the U.S. (by whatever measure you find plausible). Provide a leaflet map showing the most innovative states and cities in the U.S. on a single map based on these information.

```{r}
df$location <- paste(df$location_town, df$location_state, sep = ", ")
```

```{r}
states_pj <- df %>%
  group_by(location_state) %>%
  count(location_state) %>%
  arrange(desc(n))

cities_pg <- df %>%
  group_by(location) %>%
  count(location) %>%
  arrange(desc(n)) %>%
  head(50)

datatable(states_pj, filter = 'top', colnames = c("State", "Projects")) 
datatable(cities_pg,filter = 'top', colnames = c("Top innovative cities", "Projects")) 
```

If we'd consider Brooklyn as part of NYC, NYC would be the most innovative city. As a measure of a city innovation I simply used the amount of projects that come from there.
Now, the whole list of cities is 8,762, which is too much to locate. I need to subset this list. I will keep just the first 100 most innovative cities.

```{r}
top100_cities <- df %>%
  group_by(location) %>%
  count(location) %>%
  arrange(desc(n)) %>%
  head(100)
```


```{r, results='hide'}
GeoCoded <- purrr::map_df(.x = top100_cities$location, .f = ggmap::geocode)
```

```{r}
geocoded_df <- dplyr::bind_cols(top100_cities, GeoCoded) %>% 
  dplyr::select(
    lng = lon,
    lat,
    dplyr::everything())
```

I intalled the "tigris" package. However, I'm not very familiar with it and I found it more simple to use the shape file with the states from https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html .

```{r}
us <- readOGR(dsn = '/Users/davidev7/Documents/Columbia/Second Semester/Data Visualization/Assignments/HW3/cb_2018_us_state_500k', layer = "cb_2018_us_state_500k")
```
```{r}
states_pj$popuptext <- base::paste0("<b>", 
                                 "Number or Projects: ",
                                 "</b><br />",
                                 states_pj$n)
projects_by_state <- merge(us, states_pj, by.x = "STUSPS", by.y = "location_state")
```

```{r}
geocoded_df$popuptext <- base::paste0("<b>", 
                                 "Number or Projects: ",
                                 "</b><br />",
                                 geocoded_df$n)
```

```{r}
bins <- c(200, 500, 1000, 3000, 5000, 8000, 10000, Inf)
pal <- colorBin("RdYlBu", domain = states_pj$n, bins = bins)
```

```{r}
map_innovation <- leaflet(geocoded_df) %>%
  setView(lng = -98.5795, lat = 39.8283, zoom = 2.5) %>%
  addProviderTiles(providers$CartoDB.PositronNoLabels) %>%
  addPolygons(data = projects_by_state, 
            color = "white", 
            weight = 2, 
            smoothFactor = 0.5,
            opacity = 1.0, 
            fillOpacity = 0.9,
            dashArray = "2",
            fillColor = ~pal(n),
            highlightOptions = highlightOptions(color = "red", weight = 2,
                                                bringToFront = TRUE),
            popup = ~popuptext ,
            group = "States",
            label = ~htmlEscape(as.character(NAME))) %>%
  addCircleMarkers(color = "orange",
             label = ~htmlEscape(as.character(location)),
             fill = TRUE,
             lng = ~lng, 
             lat = ~lat,
             popup = ~popuptext,
             group = "Cities") %>%
  addLayersControl(
     baseGroups = c("States", "Cities"),
     options = layersControlOptions(collapsed = FALSE)
  )

map_innovation
```

## 2. Writing your success story

Each project contains a `blurb` -- a short description of the project. While not the full description of the project, the short headline is arguably important for inducing interest in the project (and ultimately popularity and success). Let's analyze the text.

### a) Cleaning the Text and Word Cloud

To reduce the time for analysis, select the 1000 most successful projects and a sample of 1000 unsuccessful projects. Use the cleaning functions introduced in lecture (or write your own in addition) to remove unnecessary words (stop words), syntax, punctuation, numbers, white space etc. Note, that many projects use their own unique brand names in upper cases, so try to remove these fully capitalized words as well (since we are aiming to identify common words across descriptions). Create a document-term-matrix.

Provide a word cloud of the most frequent or important words (your choice which frequency measure you choose) among the most successful projects.


```{r helper functions}
removeNumPunct <- function(x){gsub("[^[:alpha:][:space:][-]]*", "", x)}

# my own functions
removeBrand <- function (x) {gsub("\\b[A-Z]+\\b", "", x)}
subDash <- function (x) {gsub("[-]", " ", x)}

# function seen in class
clean_corpus <- function(corpus){
  corpus <- tm_map(corpus, content_transformer(removeBrand))
  corpus <- tm_map(corpus, content_transformer(removeNumPunct))
  corpus <- tm_map(corpus, content_transformer(subDash))
  corpus <- tm_map(corpus, content_transformer(removePunctuation))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, c(stopwords("en")))
    # Optionally, one could add more stop words
  corpus <- tm_map(corpus, stripWhitespace)
  return(corpus)
}

# following the advice of comparing original text and cleaned
# df_cleaned <- clean_corpus(df)

# finally the stemming process
stemCompletion2 <- function(x, dictionary) {
   x <- unlist(strsplit(as.character(x), " "))
   x <- x[x != ""]
   x <- stemCompletion(x, dictionary=dictionary)
   x <- paste(x, sep="", collapse=" ")
   PlainTextDocument(stripWhitespace(x))
}
```

```{r}
most_successful_pj <- df %>%
  filter(state == "successful") %>%
  arrange(desc(`pledged`)) %>%
  head(1000)

unsuccessful_pj <- df %>%
  filter(state == "failed") %>%
  sample_n(1000)
```

```{r}
df_source_successful <- most_successful_pj %>%
  dplyr::select(`id`, `blurb`) %>%
  rename(doc_id = id, text = blurb)
```

```{r}
df_source_un <- unsuccessful_pj %>%
  dplyr::select(`id`, `blurb`) %>%
  rename(doc_id = id, text = blurb)
```

```{r}
VCorpus(DataframeSource(df_source_successful))
```

```{r}
s_corpus_raw <- Corpus(VectorSource(df_source_successful$text))
s_corpus_raw$meta$id <- most_successful_pj$id

s_corpus_raw[[1]]$content
s_corpus_raw[[2]]$content
s_corpus_raw[[3]]$content
```
```{r}
u_corpus_raw <- Corpus(VectorSource(df_source_un$text))
u_corpus_raw$meta$id <- unsuccessful_pj$id
```


```{r}
s_corpus_cleaned <- clean_corpus(s_corpus_raw)

s_corpus_cleaned[[1]]$content
s_corpus_cleaned[[2]]$content
s_corpus_cleaned[[3]]$content
```


```{r}
s_corpus_stemmed <- tm_map(s_corpus_cleaned, stemDocument)
s_corpus_final <- tm_map(s_corpus_stemmed, stemCompletion2, dictionary = s_corpus_cleaned)
```
```{r}
u_corpus_cleaned <- clean_corpus(u_corpus_raw)
u_corpus_stemmed <- tm_map(u_corpus_cleaned, stemDocument)
u_corpus_final <- tm_map(u_corpus_stemmed, stemCompletion2, dictionary = u_corpus_cleaned)
```

```{r}
successful_dtm <- DocumentTermMatrix(s_corpus_final)
successful_dtm$dimnames$Docs <- as.character(s_corpus_final$meta$id)
```

```{r}
un_dtm <- DocumentTermMatrix(u_corpus_final)
un_dtm$dimnames$Docs <- as.character(u_corpus_final$meta$id)
```

```{r}
successful_dtm_tidy <- tidy(successful_dtm)

successful_terms <- successful_dtm_tidy %>%
  group_by(term) %>%
  summarise(n = sum(count)) %>%
  arrange(desc(n)) %>%
  top_n(100)
```

```{r}
un_dtm_tidy <- tidy(un_dtm)

unsuccessful_terms <- un_dtm_tidy %>%
  group_by(term) %>%
  summarise(n = sum(count)) %>%
  arrange(desc(n)) %>%
  top_n(100)
```

```{r}
my_colors2 <- my_colors[c(8,1,14)]
```

```{r}
wordcloud(words = successful_terms$term, 
          freq = as.integer(successful_terms$n),
          scale = c(4, .5),
          min.freq = 0, 
          max.words = 100,
          colors = my_colors2
          )
```

Since I've already done most of the work, I'm interested in a comparison with failed projects.

```{r}
wordcloud(words = unsuccessful_terms$term, 
          freq = as.integer(unsuccessful_terms$n),
          scale = c(4, .5),
          min.freq = 0, 
          max.words = 100,
          colors = my_colors
          )
```

Apparently a cry for help doesn't help! New appears to be equally frequent in both. While first is a signature for successful projects!

### b) Success in words

Provide a pyramid plot to show how the words between successful and unsuccessful projects differ in frequency. A selection of 10 - 20 top words is sufficient here. 

```{r}
all_successful_terms <- successful_dtm_tidy %>%
  group_by(term) %>%
  summarise(n = sum(count)) %>%
  arrange(desc(n))

all_unsuccessful_terms <- un_dtm_tidy %>%
  group_by(term) %>%
  summarise(n = sum(count)) %>%
  arrange(desc(n))
```

```{r}
terms <- merge(all_successful_terms, all_unsuccessful_terms, by = "term")
terms_for_plot <- terms %>% 
  filter(n.x > 30 | n.y >30)
terms_for_plot <- terms_for_plot[c(1, 7, 16, 19, 24, 27, 20, 29, 33, 35, 36, 50, 52, 53, 41, 42, 48, 12, 9, 3),]
```

```{r}
scol<- my_colors[6]
ucol<- my_colors[14]
```

```{r}
pyramid.plot(terms_for_plot$n.x, terms_for_plot$n.y, labels=terms_for_plot$term,
  main="Words frequency pyramid plot",
  top.labels = c("Successful", "Term", "Unsuccessful"),
  lxcol=scol,rxcol=ucol,
  laxlab = NULL,
             raxlab = NULL,
             unit = NULL,
             labelcex=0.5,
  gap=23,show.values=TRUE)
```

It's a very interesting graph from a psychological perspective. The more the words are "needy" the more is likely the project is going to fail. The same can be said for those projects in which things are not done yet. Words like "will" "make" are associated with failure.


### c) Simplicity as a virtue

These blurbs are short in length (max. 150 characters) but let's see whether brevity and simplicity still matters. Calculate a readability measure (Flesh Reading Ease, Flesh Kincaid or any other comparable measure) for the texts. Visualize the relationship between the readability measure and one of the measures of success. Briefly comment on your finding.

```{r}
s_pj <- most_successful_pj %>%
  dplyr::select(`blurb`, `state`, `goal`, `achievement_ratio`) %>%
  filter(`goal` > 1000)

u_pj <- unsuccessful_pj %>%
  dplyr::select(`blurb`, `state`, `goal`, `achievement_ratio`) %>%
  filter(`goal` > 1000)

df_merged <- rbind(s_pj, u_pj)
```

```{r}
readability_score <- textstat_readability(as.character(df_merged$blurb), 
        measure = c('Flesch','Flesch.Kincaid',
                  'meanSentenceLength','meanWordSyllables'))

readability_score[colnames(df_merged)] <- df_merged
```

```{r}
readability_score %>%
  filter(log(achievement_ratio) > -10) %>%
  
  ggplot(aes(x = log(achievement_ratio), y = Flesch.Kincaid, color = state)) +
  geom_point() +
  geom_smooth(color = my_colors[8]) +
  
  scale_color_manual(values = c(ucol, scol)) +
  
  labs(
    title = "Relationship between Readability and Success", 
    x = "log Achievement Ratio", 
    y = "Flesch-Kincaid Grade Level",
    color = "State"
    ) +
  
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
    
    axis.title.x = element_text(hjust = 0.475),
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    )
```

There is a slightly, but negligible, positive relationship between having a description with an higher Flesch-Kincaid score and having an higher Achievement Ratio (Money pledged / Goal, when Goal > 1000). I filtered for Goal > 1000 because many projects set very low (unrealistic) goals. The Flesch-Kincaid scale measures how easily readable a text is. The higher the score the easier the text is easy to read.


## 3. Sentiment

Now, let's check whether the use of positive / negative words or specific emotions helps a project to be successful. 

### a) Stay positive

Calculate the tone of each text based on the positive and negative words that are being used. You can rely on the Hu & Liu dictionary provided in lecture or use the Bing dictionary contained in the tidytext package (`tidytext::sentiments`). Visualize the relationship between tone of the document and success. Briefly comment.

After consulting https://www.tidytextmining.com/sentiment.html, I selected the "nrc" dictionary from Saif Mohammad and Peter Turney as it's the one with more words.

```{r}
dataframe1 <- data.frame(text=sapply(s_corpus_cleaned, identity), 
    stringsAsFactors=F)
dataframe1 <- dataframe1 %>% 
  mutate(index = row_number())

df1.1 <- most_successful_pj %>%
  dplyr::select(`state`, `goal`, `achievement_ratio`) %>%
  mutate(index = row_number())

dataframe1.2 <- left_join(dataframe1, df1.1, by = "index")
```

```{r}
dataframe2 <- data.frame(text=sapply(u_corpus_cleaned, identity), 
    stringsAsFactors=F)
dataframe2 <- dataframe2 %>% 
  mutate(index = row_number())

df2.1 <- unsuccessful_pj %>%
  dplyr::select(`state`, `goal`, `achievement_ratio`) %>%
  mutate(index = row_number())

dataframe2.2 <- left_join(dataframe2, df2.1, by = "index")
```

```{r}
df_merged_cleaned <- rbind(dataframe1.2, dataframe2.2)
```

```{r}
library(tidytext)
library(syuzhet)

polarity_score <- get_sentiment(df_merged_cleaned$text, method="nrc")

df_merged_cleaned$sentiment <- polarity_score
```

```{r}
log(100)
```

When Achievement Rate = 100 the project reached its goal.

```{r}
df_merged_cleaned %>% 
  filter(goal > 1000) %>%
  filter(log(achievement_ratio) > -8) %>%
 
  ggplot(aes(x = sentiment, y = log(achievement_ratio), color = state)) + 
  geom_point() +
  geom_smooth(color = my_colors[8]) +
  
  geom_hline(yintercept = 4.60517) +
  geom_label(
    label="Goal reached", 
    x=6.35,
    y=6,
    label.padding = unit(0.45, "lines"),
    label.size = 0.15,
    color = "black",
    fill="honeydew2"
  ) +
  
  scale_color_manual(values = c(ucol, scol)) +
  
  labs(
    title = "Relationship between Text Sentiment and Success", 
    x = "Text Sentiment", 
    y = "log Achievement Ratio",
    color = "State"
    ) +
  
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
    
    axis.title.x = element_text(hjust = 0.475),
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    )
```

This is a weird result. Apparently, once the text of a project goes beyond a slightly positive sentiment, its success chances decrease. However, it might make sense! Extremely positive texts might be suspicious. Projects might appear too good to be true.

### b) Positive vs negative

Segregate all 2,000 blurbs into positive and negative texts based on their polarity score calculated in step (a). Now, collapse the positive and negative texts into two larger documents. Create a document-term-matrix based on this collapsed set of two documents. Generate a comparison cloud showing the most-frequent positive and negative words.  

```{r}
df_merged_cleaned$polarity <- ifelse(df_merged_cleaned$sentiment < 0, "Negative", ifelse(df_merged_cleaned$sentiment >= 0, "Positive", NA))
```

```{r}
df_positive <- df_merged_cleaned %>%
  filter(polarity == "Positive")

df_negative <- df_merged_cleaned %>%
  filter(polarity == "Negative")
```

```{r}
df_3b1 <- unnest_tokens(df_merged_cleaned, output = word, input = text) 

df_3b <- df_3b1 %>%
  group_by(polarity, word) %>%
  summarise(count = n()) %>%
  cast_tdm(word, polarity, count)

matrix3b <- as.matrix(df_3b)
```

```{r}
set.seed(2105)
comparison.cloud(matrix3b, 
                 colors = c(ucol, scol), 
                 title.size= 1.3,
                 max.words = 100,
                 random.order = FALSE)
```

### c) Get in their mind

Now, use the NRC Word-Emotion Association Lexicon in the `tidytext` package to identify a larger set of emotions (anger, anticipation, disgust, fear, joy, sadness, surprise, trust). Again, visualize the relationship between the use of words from these categories and success. What is your finding?

```{r}
sentiments <- get_sentiments("nrc")
```

```{r}
df_sentiments <-  inner_join(df_3b1, sentiments, by = "word")
```

```{r}
df_sentiments %>%
  
  ggplot(aes(x = sentiment.y, y = achievement_ratio, color = state)) + 
  geom_point() +
  
  scale_color_manual(values = c(ucol, scol)) +
  
  labs(
    title = "Relationship between Sentiments and Success", 
    x = "Sentiments", 
    y = "Achievement Ratio",
    color = "State"
    ) +
  
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    )
```

The most successful projects have words that induce positive emotions like Anticipation, Joy, Positive, Surprise, and Trust. There is also a successful project that has Anger and Negative. However, none of the most successful has Disgust or Fear or Sadness. Moreover, Positive and Trust seem to be the two with the highest correlation with the Achievement Ratio.